In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

What impacts the gross revenue?

In [2]:
df = pd.read_csv('movies.csv')
df.rename(columns={'gross':'revenue'}, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7591 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7666 non-null   object 
 5   score     7665 non-null   float64
 6   votes     7665 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7665 non-null   object 
 9   star      7667 non-null   object 
 10  country   7665 non-null   object 
 11  budget    5497 non-null   float64
 12  revenue   7479 non-null   float64
 13  company   7651 non-null   object 
 14  runtime   7664 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB
In [3]:
#Checking for missing data more closely

for col in df.columns:
    pct_missing = round((df[col].isnull().sum()/df[col].shape[0])*100)
    missing = np.sum(df[col].isnull())
    print('{} - {} - {}%'.format(col, missing, pct_missing))
name - 0 - 0.0%
rating - 77 - 1.0%
genre - 0 - 0.0%
year - 0 - 0.0%
released - 2 - 0.0%
score - 3 - 0.0%
votes - 3 - 0.0%
director - 0 - 0.0%
writer - 3 - 0.0%
star - 1 - 0.0%
country - 3 - 0.0%
budget - 2171 - 28.0%
revenue - 189 - 2.0%
company - 17 - 0.0%
runtime - 4 - 0.0%

The 'Budget' column has too many missing values to replace them with the mean. We will restrict our dataframe to only include movies that have the budget. The new dataframe of reference will be 'df2'.

In [4]:
df2 = df.loc[df['budget'].isna()==False]
df2.shape
Out[4]:
(5497, 15)
In [5]:
df2.isna().sum()
Out[5]:
name         0
rating      20
genre        0
year         0
released     0
score        3
votes        3
director     0
writer       0
star         1
country      1
budget       0
revenue     61
company      6
runtime      2
dtype: int64

Filling NaN in 'revenue' column

gross_mean = df2['revenue'].mean() df2['revenue'].fillna(gross_mean, inplace=True)

In [6]:
# fixing columns of interest
df2["year"] = pd.to_datetime(df2["year"], format='%Y')
df2.dtypes
<ipython-input-6-449eec0cc815>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["year"] = pd.to_datetime(df2["year"], format='%Y')
Out[6]:
name                object
rating              object
genre               object
year        datetime64[ns]
released            object
score              float64
votes              float64
director            object
writer              object
star                object
country             object
budget             float64
revenue            float64
company             object
runtime            float64
dtype: object
In [7]:
df2 = df2.copy()

# Setting multiple items using a mask
mask = df2['revenue'].isna()
df2.loc[mask] = df2['revenue'].mean()
In [ ]:
 
In [8]:
#Checking for data types
df2.head()
Out[8]:
name rating genre year released score votes director writer star country budget revenue company runtime
0 The Shining R Drama 1980-01-01 00:00:00 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0
1 The Blue Lagoon R Adventure 1980-01-01 00:00:00 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980-01-01 00:00:00 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0
3 Airplane! PG Comedy 1980-01-01 00:00:00 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0
4 Caddyshack R Comedy 1980-01-01 00:00:00 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0
In [9]:
# fixing columns of interest
df2['revenue'] = df2['revenue'].astype('int64')
df2['budget'] = df2['budget'].astype('int64')
df2.head(2)
Out[9]:
name rating genre year released score votes director writer star country budget revenue company runtime
0 The Shining R Drama 1980-01-01 00:00:00 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000 46998772 Warner Bros. 146.0
1 The Blue Lagoon R Adventure 1980-01-01 00:00:00 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000 58853106 Columbia Pictures 104.0
In [10]:
df2.sort_values(by=['revenue'], inplace=True, ascending=False)
# Drop duplicates
df2.drop_duplicates(inplace=True)
df2.head(5)
Out[10]:
name rating genre year released score votes director writer star country budget revenue company runtime
5445 Avatar PG-13 Action 2009-01-01 00:00:00 December 18, 2009 (United States) 7.8 1100000.0 James Cameron James Cameron Sam Worthington United States 237000000 2847246203 Twentieth Century Fox 162.0
7445 Avengers: Endgame PG-13 Action 2019-01-01 00:00:00 April 26, 2019 (United States) 8.4 903000.0 Anthony Russo Christopher Markus Robert Downey Jr. United States 356000000 2797501328 Marvel Studios 181.0
3045 Titanic PG-13 Drama 1997-01-01 00:00:00 December 19, 1997 (United States) 7.8 1100000.0 James Cameron James Cameron Leonardo DiCaprio United States 200000000 2201647264 Twentieth Century Fox 194.0
6663 Star Wars: Episode VII - The Force Awakens PG-13 Action 2015-01-01 00:00:00 December 18, 2015 (United States) 7.8 876000.0 J.J. Abrams Lawrence Kasdan Daisy Ridley United States 245000000 2069521700 Lucasfilm 138.0
7244 Avengers: Infinity War PG-13 Action 2018-01-01 00:00:00 April 27, 2018 (United States) 8.4 897000.0 Anthony Russo Christopher Markus Robert Downey Jr. United States 321000000 2048359754 Marvel Studios 149.0

Let's get an overview on the correlations for the numerical data in the dataset

In [11]:
pd.plotting.scatter_matrix(df2, alpha=0.2);
In [12]:
correlation_matrix = df2.corr(method='pearson')
correlation_matrix
Out[12]:
score votes budget revenue runtime
score 1.000000e+00 0.991399 0.021888 1.531468e-07 1.000000
votes 9.913985e-01 1.000000 0.079275 8.047743e-02 0.991399
budget 2.188827e-02 0.079275 1.000000 7.402175e-01 0.021893
revenue 1.531468e-07 0.080477 0.740218 1.000000e+00 0.000003
runtime 1.000000e+00 0.991399 0.021893 2.708556e-06 1.000000
In [13]:
sns.heatmap(correlation_matrix, annot=True)

plt.title('Correlation Matrix for Numeric features')
plt.xlabel('Features')
plt.ylabel('Features')

print("⚠️NOTE: The brighter the box, the higher the correlation.")
⚠️NOTE: The brighter the box, the higher the correlation.

It looks like higher budgets for a movie result in both higher revenues and better votes. Let's dive deeper.

We will explore the following:

  • 'Budget vs Gross Revenue'
  • 'Budget vs Votes'

And then we will use a bubble chart to graphically assess the relationship between the three variables at once, depending on the movie genre.

  • 'Budget vs Gross Revenue vs Votes'

Budget vs Gross Revenue

In [14]:
plt.scatter(x=df2['budget'], y=df2['revenue'])
plt.title('Budget vs Gross Revenue')
plt.xlabel('Budget')
plt.ylabel('Gross Revenue');
In [15]:
# Plot budget vs gross using seaborn
sns.regplot(x='budget', y='revenue',data=df2, scatter_kws={"color":"red"},line_kws={'color':"blue"})
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa1b591e910>

Budget vs Votes

In [16]:
plt.scatter(x=df2['budget'], y=df2['votes'])
plt.title('Budget vs Votes')
plt.xlabel('Budget')
plt.ylabel('Votes');
In [17]:
sns.regplot(x='budget', y='votes',data=df2, scatter_kws={"color":"red"},line_kws={'color':"green"})
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa1b6104490>

Let's look at how the three variables interplay, based on each genre, using the bubble chart

In [18]:
df2_restricted = df2[["genre", "budget", "revenue", "votes"]]

df2_grouped = df2_restricted.groupby('genre').mean()

for col in df2_grouped.columns:
    df2_grouped[col] = df2_grouped[col].astype('int64')
    
df2_grouped
Out[18]:
budget revenue votes
genre
103004458.12490802 103004458 103004458 103004458
Action 58415252 167813008 163182
Adventure 45958899 133268232 123237
Animation 76052410 280124624 147542
Biography 25360832 61207107 115714
Comedy 22802371 59167658 71959
Crime 22571390 50082281 113498
Drama 23190840 60231437 107395
Family 51125000 518583146 173225
Fantasy 16885714 39308611 61166
Horror 13299476 56149645 81207
Mystery 31876470 117887733 266647
Romance 24040000 33575381 67720
Sci-Fi 24416666 37610102 107283
Thriller 12742857 42488141 31500
Western 10500000 10012943 2850
In [19]:
import plotly.express as px
fig = px.scatter(df2_grouped, x="budget", y="votes", size="revenue", color=df2_grouped.index, hover_name=df2_grouped.index, log_x=True, size_max=60)
fig.show()

Well...the more money you pump into your movie, the higher the votes it receives! It may have have something to do with the amount of advertising involved, which is something we are not considering in this analysis.

In any case, the correlation between budget and revenue is also quite strong. It looks like as the budget increases, the size of the bubbles (that indicates the revenue) also tends to get bigger.

That is especially true for Action movies and Animation movies. No surprise there...they are my favorite genres!

In [20]:
df2_grouped.corr()
Out[20]:
budget revenue votes
budget 1.000000 0.510124 0.705092
revenue 0.510124 1.000000 -0.014710
votes 0.705092 -0.014710 1.000000

Do adventure movies yield higher revenues than action movies on average?

I really like action movies, and I would like them to make more money than adventure movies do. Let's see if, on average, the revenue for action movies is grater than that for adventure movies.

We will use bootstrapping to simulate 10000 samples of 1200 items from the df2 dataset. We will display the distribution of the sample mean in an histogram.

Sample size determination

We want to be 95% sure that we will be coorect within a +- error. The formula for determining the sample size for comparing means is:

Screen%20Shot%202022-08-14%20at%2010.35.13.png

In our case, we need:

In [21]:
z_score = 1.96
std = np.std(df2_grouped["revenue"])
err = 0.5
sample_size = (z_score) * (std**2) / err**2

sample_size
Out[21]:
1.1977349573567747e+17
In [22]:
df2
Out[22]:
name rating genre year released score votes director writer star country budget revenue company runtime
5445 Avatar PG-13 Action 2009-01-01 00:00:00 December 18, 2009 (United States) 7.8 1100000.0 James Cameron James Cameron Sam Worthington United States 237000000 2847246203 Twentieth Century Fox 162.0
7445 Avengers: Endgame PG-13 Action 2019-01-01 00:00:00 April 26, 2019 (United States) 8.4 903000.0 Anthony Russo Christopher Markus Robert Downey Jr. United States 356000000 2797501328 Marvel Studios 181.0
3045 Titanic PG-13 Drama 1997-01-01 00:00:00 December 19, 1997 (United States) 7.8 1100000.0 James Cameron James Cameron Leonardo DiCaprio United States 200000000 2201647264 Twentieth Century Fox 194.0
6663 Star Wars: Episode VII - The Force Awakens PG-13 Action 2015-01-01 00:00:00 December 18, 2015 (United States) 7.8 876000.0 J.J. Abrams Lawrence Kasdan Daisy Ridley United States 245000000 2069521700 Lucasfilm 138.0
7244 Avengers: Infinity War PG-13 Action 2018-01-01 00:00:00 April 27, 2018 (United States) 8.4 897000.0 Anthony Russo Christopher Markus Robert Downey Jr. United States 321000000 2048359754 Marvel Studios 149.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2434 Philadelphia Experiment II PG-13 Action 1993-01-01 00:00:00 June 4, 1994 (South Korea) 4.5 1900.0 Stephen Cornwell Wallace C. Bennett Brad Johnson United States 5000000 2970 Trimark Pictures 97.0
3681 Ginger Snaps Not Rated Drama 2000-01-01 00:00:00 May 11, 2001 (Canada) 6.8 43000.0 John Fawcett Karen Walton Emily Perkins Canada 5000000 2554 Copperheart Entertainment 108.0
272 Parasite R Horror 1982-01-01 00:00:00 March 12, 1982 (United States) 3.9 2300.0 Charles Band Alan J. Adler Robert Glaudini United States 800000 2270 Embassy Pictures 85.0
2417 Madadayo NaN Drama 1993-01-01 00:00:00 April 17, 1993 (Japan) 7.3 5100.0 Akira Kurosawa Ishirô Honda Tatsuo Matsumura Japan 11900000 596 DENTSU Music And Entertainment 134.0
3203 Trojan War PG-13 Comedy 1997-01-01 00:00:00 October 1, 1997 (Brazil) 5.7 5800.0 George Huang Andy Burg Will Friedle United States 15000000 309 Daybreak 85.0

5437 rows × 15 columns

In [23]:
diff = []
for n in range(10000):
    bootstrap = df2.sample(1200, replace=True)
    revenue_action = round(bootstrap[bootstrap['genre']=='Action']['revenue'].mean())
    revenue_adventure = round(bootstrap[bootstrap['genre']=='Adventure']['revenue'].mean())
    diff.append(revenue_action - revenue_adventure)

plt.hist(diff);
In [24]:
np.percentile(bootstrap['revenue'], 2.5), np.percentile(bootstrap['revenue'], 97.5), np.mean(diff)
Out[24]:
(452921.15, 611388946.3249993, 34269898.2273)

We can be 95% confident that there is a difference in the average revenue for action movies and for adventure movies.

A quick look at categorical variables

Let's convert all non-numeric data to numeric data, and store that in a new dataframe, 'df_numerised'

In [25]:
df_numerised = df2

for col in df_numerised.columns:
    if(df_numerised[col].dtype == 'object'):
        df_numerised[col] = df_numerised[col].astype('category')
        df_numerised[col] = df_numerised[col].cat.codes
In [26]:
correlation_matrix_2 = df_numerised.corr(method='pearson')
correlation_matrix_2
Out[26]:
name rating genre year released score votes director writer star country budget revenue company runtime
name 1.000000 -0.019973 0.010138 -0.017417 -0.005342 -2.352816e-02 -0.021636 0.014254 0.012159 -0.006551 -0.023120 0.023171 5.964132e-03 0.021873 -0.023528
rating -0.019973 1.000000 0.125641 -0.011730 0.020621 -7.889296e-02 -0.076957 0.016655 -0.005568 0.004661 0.026442 -0.184195 -1.651883e-01 -0.077689 -0.078892
genre 0.010138 0.125641 1.000000 0.054213 0.022224 -2.232985e-02 -0.039993 -0.008764 0.018025 0.003144 -0.009568 -0.369209 -2.443186e-01 -0.072600 -0.022337
year -0.017417 -0.011730 0.054213 1.000000 -0.002991 2.759804e-02 0.004663 0.026512 0.012400 0.017599 0.048430 -0.266410 -2.282175e-01 0.005697 0.027598
released -0.005342 0.020621 0.022224 -0.002991 1.000000 -2.342722e-02 -0.019428 0.003899 0.009069 0.015907 -0.015838 0.019365 8.611611e-03 -0.002250 -0.023429
score -0.023528 -0.078893 -0.022330 0.027598 -0.023427 1.000000e+00 0.991399 -0.023898 -0.023930 -0.023581 -0.057982 0.021888 1.531468e-07 -0.028843 1.000000
votes -0.021636 -0.076957 -0.039993 0.004663 -0.019428 9.913985e-01 1.000000 -0.025027 -0.024430 -0.025621 -0.051976 0.079275 8.047743e-02 -0.012979 0.991399
director 0.014254 0.016655 -0.008764 0.026512 0.003899 -2.389826e-02 -0.025027 1.000000 0.263362 0.036832 0.012455 -0.009859 -2.935779e-02 -0.006643 -0.023898
writer 0.012159 -0.005568 0.018025 0.012400 0.009069 -2.393050e-02 -0.024430 0.263362 1.000000 0.019827 0.023483 -0.039924 -3.598868e-02 -0.001965 -0.023931
star -0.006551 0.004661 0.003144 0.017599 0.015907 -2.358116e-02 -0.025621 0.036832 0.019827 1.000000 -0.008717 -0.021787 -9.296669e-06 0.015757 -0.023581
country -0.023120 0.026442 -0.009568 0.048430 -0.015838 -5.798207e-02 -0.051976 0.012455 0.023483 -0.008717 1.000000 0.053008 6.083175e-02 0.051130 -0.057982
budget 0.023171 -0.184195 -0.369209 -0.266410 0.019365 2.188827e-02 0.079275 -0.009859 -0.039924 -0.021787 0.053008 1.000000 7.402175e-01 0.170642 0.021893
revenue 0.005964 -0.165188 -0.244319 -0.228217 0.008612 1.531468e-07 0.080477 -0.029358 -0.035989 -0.000009 0.060832 0.740218 1.000000e+00 0.150170 0.000003
company 0.021873 -0.077689 -0.072600 0.005697 -0.002250 -2.884303e-02 -0.012979 -0.006643 -0.001965 0.015757 0.051130 0.170642 1.501704e-01 1.000000 -0.028843
runtime -0.023528 -0.078892 -0.022337 0.027598 -0.023429 1.000000e+00 0.991399 -0.023898 -0.023931 -0.023581 -0.057982 0.021893 2.708556e-06 -0.028843 1.000000
In [27]:
sns.heatmap(correlation_matrix_2, annot=True)

plt.title('Correlation Matrix for Non-Numeric features')
plt.xlabel('Features')
plt.ylabel('Features')

print("⚠️NOTE: The brighter the box, the higher the correlation.")
⚠️NOTE: The brighter the box, the higher the correlation.
In [28]:
correlation_matrix_2 = df_numerised.corr(method='pearson')
corr_pairs = correlation_matrix_2.unstack()
corr_pairs
Out[28]:
name     name        1.000000
         rating     -0.019973
         genre       0.010138
         year       -0.017417
         released   -0.005342
                       ...   
runtime  country    -0.057982
         budget      0.021893
         revenue     0.000003
         company    -0.028843
         runtime     1.000000
Length: 225, dtype: float64
In [29]:
sorted_pairs = corr_pairs.sort_values()
high_corr = sorted_pairs[(sorted_pairs)>0.5] 
high_corr
Out[29]:
budget    revenue     0.740218
revenue   budget      0.740218
votes     score       0.991399
score     votes       0.991399
runtime   votes       0.991399
votes     runtime     0.991399
score     runtime     1.000000
runtime   score       1.000000
name      name        1.000000
director  director    1.000000
revenue   revenue     1.000000
budget    budget      1.000000
country   country     1.000000
star      star        1.000000
writer    writer      1.000000
votes     votes       1.000000
score     score       1.000000
released  released    1.000000
year      year        1.000000
genre     genre       1.000000
rating    rating      1.000000
company   company     1.000000
runtime   runtime     1.000000
dtype: float64
In [ ]: